# Import required R libraries
library(fpp3)
library(tidyverse)
library(readxl)
library(seasonal)

This project consists of 3 parts - two required and one bonus and is worth 15% of your grade. The project is due at 11:59 PM on Sunday October 31. I will accept late submissions with a penalty until the meetup after that when we review some projects.

Part A – ATM Forecast

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable Cash is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file. Also please submit the forecast which you will put in an Excel readable file.

# Read in data
atm_data_raw <- read_excel("data/ATM624Data.xlsx")

# Properly convert the DATE column to match true input
# https://stackoverflow.com/questions/43230470/how-to-convert-excel-date-format-to-proper-date-in-r
atm_data_raw <- atm_data_raw %>% mutate(DATE = as.Date(DATE, origin = "1899-12-30"))

# Initial output to see data
head(atm_data_raw)
## # A tibble: 6 × 3
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2009-05-01 ATM1     96
## 2 2009-05-01 ATM2    107
## 3 2009-05-02 ATM1     82
## 4 2009-05-02 ATM2     89
## 5 2009-05-03 ATM1     85
## 6 2009-05-03 ATM2     90
# Output summary for high level assessment
summary(atm_data_raw)
##       DATE                ATM                 Cash        
##  Min.   :2009-05-01   Length:1474        Min.   :    0.0  
##  1st Qu.:2009-08-01   Class :character   1st Qu.:    0.5  
##  Median :2009-11-01   Mode  :character   Median :   73.0  
##  Mean   :2009-10-31                      Mean   :  155.6  
##  3rd Qu.:2010-02-01                      3rd Qu.:  114.0  
##  Max.   :2010-05-14                      Max.   :10919.8  
##                                          NA's   :19
# Check dimensions to understand breadth of data
dim(atm_data_raw)
## [1] 1474    3
# 1474    3
#Cash in hundreds
# Define as tsibble with DATE as index
atm_data_ts <- atm_data_raw %>%
  as_tsibble(index = DATE, key = ATM)

# Output tsibble to confirm proper format and definition
atm_data_ts
## # A tsibble: 1,474 x 3 [1D]
## # Key:       ATM [5]
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-05-01 ATM1     96
##  2 2009-05-02 ATM1     82
##  3 2009-05-03 ATM1     85
##  4 2009-05-04 ATM1     90
##  5 2009-05-05 ATM1     99
##  6 2009-05-06 ATM1     88
##  7 2009-05-07 ATM1      8
##  8 2009-05-08 ATM1    104
##  9 2009-05-09 ATM1     87
## 10 2009-05-10 ATM1     93
## # … with 1,464 more rows
# Filter to ATM1 data
atm1_data_ts <- atm_data_ts %>%
  filter(ATM == 'ATM1')

# Output summary
summary(atm1_data_ts)
##       DATE                ATM                 Cash       
##  Min.   :2009-05-01   Length:365         Min.   :  1.00  
##  1st Qu.:2009-07-31   Class :character   1st Qu.: 73.00  
##  Median :2009-10-30   Mode  :character   Median : 91.00  
##  Mean   :2009-10-30                      Mean   : 83.89  
##  3rd Qu.:2010-01-29                      3rd Qu.:108.00  
##  Max.   :2010-04-30                      Max.   :180.00  
##                                          NA's   :3
#atm1_data_ts

atm1_data_ts %>%
  autoplot(Cash)

# Calculate median value for ATM1
median <- median(atm1_data_ts$Cash, na.rm=TRUE)

# Set NAs to median
atm1_data_ts$Cash[is.na(atm1_data_ts$Cash)] <- median

summary(atm1_data_ts)
##       DATE                ATM                 Cash       
##  Min.   :2009-05-01   Length:365         Min.   :  1.00  
##  1st Qu.:2009-07-31   Class :character   1st Qu.: 73.00  
##  Median :2009-10-30   Mode  :character   Median : 91.00  
##  Mean   :2009-10-30                      Mean   : 83.95  
##  3rd Qu.:2010-01-29                      3rd Qu.:108.00  
##  Max.   :2010-04-30                      Max.   :180.00
head(atm1_data_ts)
## # A tsibble: 6 x 3 [1D]
## # Key:       ATM [1]
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2009-05-01 ATM1     96
## 2 2009-05-02 ATM1     82
## 3 2009-05-03 ATM1     85
## 4 2009-05-04 ATM1     90
## 5 2009-05-05 ATM1     99
## 6 2009-05-06 ATM1     88
atm1_data_ts <- atm1_data_ts %>%
  mutate(DATE = as_date(DATE)) %>%
  update_tsibble(index = DATE)

atm1_data_ts
## # A tsibble: 365 x 3 [1D]
## # Key:       ATM [1]
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-05-01 ATM1     96
##  2 2009-05-02 ATM1     82
##  3 2009-05-03 ATM1     85
##  4 2009-05-04 ATM1     90
##  5 2009-05-05 ATM1     99
##  6 2009-05-06 ATM1     88
##  7 2009-05-07 ATM1      8
##  8 2009-05-08 ATM1    104
##  9 2009-05-09 ATM1     87
## 10 2009-05-10 ATM1     93
## # … with 355 more rows
# From: https://stats.stackexchange.com/questions/494013/control-the-period-for-daily-time-series-in-tsibbles
atm1_dcmp <- atm1_data_ts %>%
  model(stl = STL(Cash ~ trend(window=Inf) + season(period=7, window="periodic"))) 

atm1_dcmp %>% components() %>% autoplot()

components(atm1_dcmp) %>%
  as_tsibble() %>%
  filter_index("2009-05-01" ~ "2010-04-30") %>%
  autoplot(Cash, colour = "gray") +
  geom_line(aes(y=season_adjust), colour = "#0072B2") +
  labs(y = "Cash (in hundreds $USD)", title = "Seasonally Adjusted Trendline")

components(atm1_dcmp) %>%
  as_tsibble() %>%
  filter_index("2010-02-16" ~ "2010-04-30") %>%
  autoplot(Cash, colour = "gray") +
  geom_line(aes(y=season_adjust), colour = "#0072B2") +
  labs(y = "Cash (in hundreds $USD)", title = "Seasonally Adjusted Trendline")

# from textbook: If the seasonal component is removed from the original data, the resulting values are the “seasonally adjusted” data.

Above plot shows that the seasonally adjusted data does not account for the weekly seasonal nature for the last three months. There is a shift.

atm1_data_ts %>% ACF() %>% autoplot()

atm1_data_ts %>% PACF() %>% autoplot()

atm1_data_ts %>% gg_season(Cash, period = "week") +
  theme(legend.position = "none") +
  labs(y="$USD (in hundreds)", title="ATM Withdrawals")

atm1_data_ts %>%
  gg_subseries(period = "week") +
  labs(
    y = "$USD (in hundreds)",
    title = "ATM Withdrawals"
  )

# Create training set (assume 1 year of data)
# 292 days for training, approx. 80% of data
train_atm1 <- atm1_data_ts %>% 
  filter_index("2009-05-01" ~ "2010-02-17")

train_atm1
## # A tsibble: 293 x 3 [1D]
## # Key:       ATM [1]
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-05-01 ATM1     96
##  2 2009-05-02 ATM1     82
##  3 2009-05-03 ATM1     85
##  4 2009-05-04 ATM1     90
##  5 2009-05-05 ATM1     99
##  6 2009-05-06 ATM1     88
##  7 2009-05-07 ATM1      8
##  8 2009-05-08 ATM1    104
##  9 2009-05-09 ATM1     87
## 10 2009-05-10 ATM1     93
## # … with 283 more rows
# Fit the models
fit_atm1 <- train_atm1 %>%
  model(
    Naive = NAIVE(Cash),
    `Seasonal naive` = SNAIVE(Cash),
    `Random walk` = RW(Cash ~ drift()),
    Arima = ARIMA(Cash),
    ETS_Add = ETS(Cash ~ error("A") + trend("N") + season("A")),
    ETS_Mult = ETS(Cash ~ error("M") + trend("N") + season("M")),
    ETS_Damp = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
  )

fit_atm1_arima <- train_atm1 %>%
  model(ARIMA(Cash))

fit_atm1 %>% accuracy()
## # A tibble: 7 × 11
##   ATM   .model     .type         ME  RMSE   MAE   MPE  MAPE  MASE RMSSE     ACF1
##   <chr> <chr>      <chr>      <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 ATM1  Naive      Train…  2.84e- 1  49.9  37.4 -85.0 120.  2.07  1.77  -0.356  
## 2 ATM1  Seasonal … Train…  2.03e- 1  28.1  18.0 -83.3 103.  1     1      0.0957 
## 3 ATM1  Random wa… Train… -2.13e-16  49.9  37.5 -85.7 121.  2.08  1.77  -0.356  
## 4 ATM1  Arima      Train…  1.79e- 1  21.9  14.3 -66.5  82.7 0.790 0.780  0.00341
## 5 ATM1  ETS_Add    Train…  1.75e- 1  21.4  13.7 -73.3  89.7 0.761 0.763  0.0940 
## 6 ATM1  ETS_Mult   Train…  1.98e+ 0  21.6  13.7 -75.8  90.6 0.760 0.769  0.0956 
## 7 ATM1  ETS_Damp   Train…  1.46e+ 0  21.3  13.3 -76.4  90.4 0.737 0.759  0.0756
# Check the residuals of Seasonal Naive
fit_atm1_arima %>% gg_tsresiduals()

# Generate forecasts for 72 days
fc_atm1 <- fit_atm1 %>% forecast(h = "72 days")

fc_atm1 %>% accuracy(atm1_data_ts)
## # A tibble: 7 × 11
##   .model         ATM   .type      ME  RMSE   MAE   MPE  MAPE  MASE RMSSE     ACF1
##   <chr>          <chr> <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 Arima          ATM1  Test    -2.20  50.8  37.8 -466.  502.  2.09  1.81 -0.0565 
## 2 ETS_Add        ATM1  Test    -3.25  52.4  38.0 -479.  514.  2.11  1.86  0.00348
## 3 ETS_Damp       ATM1  Test   -10.4   54.2  40.0 -517.  547.  2.22  1.93 -0.0246 
## 4 ETS_Mult       ATM1  Test    -7.03  52.9  39.2 -494.  527.  2.17  1.88 -0.00808
## 5 Naive          ATM1  Test   -98.2  104.   98.2 -893.  893.  5.44  3.71 -0.0585 
## 6 Random walk    ATM1  Test  -109.   114.  109.  -944.  944.  6.02  4.07  0.0162 
## 7 Seasonal naive ATM1  Test    -5.21  64.1  53.7 -460.  509.  2.97  2.28  0.00878
# Plot forecasts against actual values
fc_atm1 %>%
  autoplot(filter_index(train_atm1, "2010-02-18" ~ "2010-04-30"), level = NULL) +
  autolayer(
    filter_index(atm1_data_ts, "2010-02-18" ~ "2010-04-30"),
    colour = "black"
  ) +
  labs(
    y = "Cash (in hundreds $USD)",
    title = "Forecasts for ATM1 Withdrawals"
  ) +
  guides(colour = guide_legend(title = "Forecast"))

RMSE of ARIMA model on test data: 21.91975, next best is RMSE of Seasonal Naive 28.11888.

The forecasts don’t line up with the actual data well. The seasonal nature of the actual data doesn’t match the seasonal nature of the forecasted data. Further investigation is needed.

### REMOVE THIS SECTION, AS I'VE PUT ALL THE MODELS INTO ONE INITIAL ATTEMPT #####
# ETS

fit_atm1_ets <- train_atm1 %>%
  model(
    add = ETS(Cash ~ error("A") + trend("N") + season("A")),
    mult = ETS(Cash ~ error("M") + trend("N") + season("M")),
    damp = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
  )

# Forecast for 73 days
fc_atm1_ets <- fit_atm1_ets %>% forecast(h = "73 days")

fc_atm1_ets %>%
  autoplot(filter_index(train_atm1, "2009-12-01" ~ "2010-04-30"), level = NULL) +
  autolayer(
    filter_index(atm1_data_ts, "2009-12-01" ~ "2010-04-30"),
    colour = "black"
  ) +
  labs(y="Billions $USD", title="GDP: China") +
  guides(colour = guide_legend(title = "Forecast"))
# ATM1 train on 2010-02-16 to 2010-04-30 (end)
# 15 + 31 + 30 = 76 days, 80% is 61 days
new_seasonal_atm1 <- atm1_data_ts %>% 
  filter_index("2010-02-16" ~ "2010-04-30")

train_atm1 <- atm1_data_ts %>% 
  filter_index("2010-02-16" ~ "2010-04-14")

# Consider Box-Cox
lambda <- new_seasonal_atm1 %>%
  features(Cash, features = guerrero) %>%
  pull(lambda_guerrero)

new_seasonal_atm1 %>%
  autoplot(box_cox(Cash, lambda)) +
  labs(y = "",
       title = latex2exp::TeX(paste0(
         "Transformed ATM1 withdrawals with $\\lambda$ = ",
         round(lambda,2))))

Applying Box-Cox, the SNaive RMSE went up to 10.08937, without Box-Cox 9.975

# Fit the models
fit_atm1 <- train_atm1 %>%
  model(
    `Seasonal naive` = SNAIVE(Cash),
    Arima = ARIMA(Cash),
    ETS_Add = ETS(Cash ~ error("A") + trend("N") + season("A")),
    ETS_Mult = ETS(Cash ~ error("M") + trend("N") + season("M")),
    ETS_Damp = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
  )

report(fit_atm1)
## # A tibble: 5 × 12
##   ATM   .model  sigma2 log_lik   AIC  AICc   BIC ar_roots ma_roots    MSE   AMSE
##   <chr> <chr>    <dbl>   <dbl> <dbl> <dbl> <dbl> <list>   <list>    <dbl>  <dbl>
## 1 ATM1  Seaso… 575.        NA    NA    NA    NA  <NULL>   <NULL>      NA     NA 
## 2 ATM1  Arima  311.      -219.  447.  448.  455. <cpl [9… <cpl [0…    NA     NA 
## 3 ATM1  ETS_A… 327.      -281.  582.  586.  602. <NULL>   <NULL>     276.   330.
## 4 ATM1  ETS_M…   0.155   -308.  636.  641.  657. <NULL>   <NULL>    1046.  1437.
## 5 ATM1  ETS_D…   0.405   -329.  683.  692.  710. <NULL>   <NULL>   27524. 41523.
## # … with 1 more variable: MAE <dbl>
fit_atm1 %>% accuracy()
## # A tibble: 5 × 11
##   ATM   .model         .type      ME  RMSE   MAE    MPE  MAPE  MASE RMSSE    ACF1
##   <chr> <chr>          <chr>   <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 ATM1  Seasonal naive Trai…  -5.20   24.3  15.9 -214.  230.  1     1      0.444 
## 2 ATM1  Arima          Trai…   0.138  16.0  11.9  -97.6 143.  0.753 0.660 -0.127 
## 3 ATM1  ETS_Add        Trai…  -1.19   16.6  11.7  -56.5  68.7 0.739 0.684  0.0722
## 4 ATM1  ETS_Mult       Trai…  -2.40   32.3  20.8 -125.  145.  1.31  1.33   0.480 
## 5 ATM1  ETS_Damp       Trai… -10.4   166.   61.7  -13.8  68.2 3.89  6.82   0.412
fit_atm1_arima<- train_atm1 %>%
  model(ARIMA(Cash))

# Check the residuals of Seasonal Naive
fit_atm1_arima %>% gg_tsresiduals()

# Generate forecasts for 16 days (two weeks), so we'll see if it picks up the seasonal nature
fc_atm1 <- fit_atm1 %>% forecast(h = "16 days")

fc_atm1 %>% accuracy(new_seasonal_atm1)
## # A tibble: 5 × 11
##   .model         ATM   .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE     ACF1
##   <chr>          <chr> <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 Arima          ATM1  Test   7.71  12.8   9.50 40.9  43.2  0.599 0.527  0.0154 
## 2 ETS_Add        ATM1  Test   1.06  11.8   8.88  5.50 15.3  0.560 0.486 -0.337  
## 3 ETS_Damp       ATM1  Test  46.6   54.0  46.6  57.3  58.4  2.94  2.22   0.0345 
## 4 ETS_Mult       ATM1  Test   9.17  15.1  11.0  -2.95 26.2  0.694 0.621 -0.161  
## 5 Seasonal naive ATM1  Test   0.125  9.64  7.12  1.26  9.83 0.449 0.397 -0.00791
# Plot forecasts against actual values
fc_atm1 %>%
  autoplot(filter_index(train_atm1, "2010-04-01" ~ "2010-04-30"), level = NULL) +
  autolayer(
    filter_index(new_seasonal_atm1, "2010-04-01" ~ "2010-04-30"),
    colour = "black"
  ) +
  labs(
    y = "Cash (in hundreds $USD)",
    title = "Forecasts for ATM1 Withdrawals"
  ) +
  guides(colour = guide_legend(title = "Forecast"))

# Forecast for May 2010
fit_atm1 <- new_seasonal_atm1 %>%
  model(ETS_Add = ETS(Cash ~ error("A") + trend("N") + season("A")),
        ARIMA(Cash))

report(fit_atm1)

fit_atm1 %>% accuracy()

# Check the residuals of Seasonal Naive
#fit_atm1 %>% gg_tsresiduals()

# Generate forecasts for 31 days of May 2010
fc_atm1 <- fit_atm1 %>% forecast(h = "31 days")

# Plot forecasts against actual values
fc_atm1 %>%
  autoplot(filter_index(new_seasonal_atm1, "2010-04-01" ~ "2010-04-30"), level = NULL) +
  autolayer(
    filter_index(new_seasonal_atm1, "2010-04-01" ~ "2010-04-30"),
    colour = "black"
  ) +
  labs(
    y = "Cash (in hundreds $USD)",
    title = "Forecasts for ATM1 Withdrawals"
  ) +
  guides(colour = guide_legend(title = "Forecast"))
# ETS: for just the final 76 days also

fit_atm1_ets <- train_atm1 %>%
  model(
    add = ETS(Cash ~ error("A") + trend("N") + season("A")),
    mult = ETS(Cash ~ error("M") + trend("N") + season("M")),
    damp = ETS(Cash ~ error("M") + trend("Ad") + season("M"))
  )

fit_atm1_ets %>% accuracy()

# Forecast for 16 days
fc_atm1_ets <- fit_atm1_ets %>% forecast(h = "16 days")

fc_atm1_ets %>% accuracy(atm1_data_ts)

fc_atm1_ets %>%
  autoplot(filter_index(train_atm1, "2010-02-16" ~ "2010-04-30"), level = NULL) +
  autolayer(
    filter_index(atm1_data_ts, "2010-02-16" ~ "2010-04-30"),
    colour = "black"
  ) +
  labs(y="$USD (in hundreds)", title="ATM1 Withdrawals Forecast") +
  guides(colour = guide_legend(title = "Forecast"))
atm2_data_ts <- atm_data_ts %>%
  filter(ATM == 'ATM2')

atm2_data_ts %>%  autoplot(Cash)

# Calculate median value for ATM2
median <- median(atm2_data_ts$Cash, na.rm=TRUE)

# Set NAs to median
atm2_data_ts$Cash[is.na(atm2_data_ts$Cash)] <- median

atm2_data_ts
## # A tsibble: 365 x 3 [1D]
## # Key:       ATM [1]
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-05-01 ATM2    107
##  2 2009-05-02 ATM2     89
##  3 2009-05-03 ATM2     90
##  4 2009-05-04 ATM2     55
##  5 2009-05-05 ATM2     79
##  6 2009-05-06 ATM2     19
##  7 2009-05-07 ATM2      2
##  8 2009-05-08 ATM2    103
##  9 2009-05-09 ATM2    107
## 10 2009-05-10 ATM2    118
## # … with 355 more rows
atm2_data_ts %>%
  model(stl = STL(Cash ~ trend(window=Inf) + season(period=7, window="periodic"))
        ) %>% 
  components() %>% autoplot()

Same thing as ATM1, the seasonal nature of the data changes in the final few months. I’m gonna separate the forecast into just the final section where the weekly seasonal nature shifts.

atm2_data_ts %>% ACF() %>% autoplot()

atm2_data_ts %>% PACF() %>% autoplot()

atm2_data_ts %>% gg_season(Cash, period = "week") +
  theme(legend.position = "none") +
  labs(y="$ (in hundreds)", title="ATM Withdrawals")

atm2_data_ts %>%
  gg_subseries(period = "week") +
  labs(
    y = "$ (in hundreds)",
    title = "ATM Withdrawals"
  )

# From: https://stats.stackexchange.com/questions/494013/control-the-period-for-daily-time-series-in-tsibbles
atm2_dcmp <- atm2_data_ts %>%
  model(stl = STL(Cash ~ trend(window=Inf) + season(period=7, window="periodic"))) 

atm2_dcmp %>% components() %>% autoplot()

components(atm2_dcmp) %>%
  as_tsibble() %>%
  filter_index("2009-05-01" ~ "2010-04-30") %>%
  autoplot(Cash, colour = "gray") +
  geom_line(aes(y=season_adjust), colour = "#0072B2") +
  labs(y = "Cash (in hundreds)", title = "Seasonally Adjusted Trendline")

components(atm2_dcmp) %>%
  as_tsibble() %>%
  filter_index("2010-02-16" ~ "2010-04-30") %>%
  autoplot(Cash, colour = "gray") +
  geom_line(aes(y=season_adjust), colour = "#0072B2") +
  labs(y = "Cash (in hundreds)", title = "Seasonally Adjusted Trendline")

# from textbook: If the seasonal component is removed from the original data, the resulting values are the “seasonally adjusted” data.
atm3_data_ts <- atm_data_ts %>%
  filter(ATM == 'ATM3')

atm3_data_ts %>% autoplot(Cash)

atm3_data_ts <- atm_data_ts %>%
  filter(ATM == 'ATM3', Cash > 0)

atm3_data_ts %>% autoplot(Cash)

summary(atm3_data_ts)
##       DATE                ATM                 Cash      
##  Min.   :2010-04-28   Length:3           Min.   :82.00  
##  1st Qu.:2010-04-28   Class :character   1st Qu.:83.50  
##  Median :2010-04-29   Mode  :character   Median :85.00  
##  Mean   :2010-04-29                      Mean   :87.67  
##  3rd Qu.:2010-04-29                      3rd Qu.:90.50  
##  Max.   :2010-04-30                      Max.   :96.00
atm3_data_ts
## # A tsibble: 3 x 3 [1D]
## # Key:       ATM [1]
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2010-04-28 ATM3     96
## 2 2010-04-29 ATM3     82
## 3 2010-04-30 ATM3     85
# Literally just 3 values above 0
atm3_data_ts <- atm_data_ts %>%
  filter(ATM == 'ATM3')

atm3_data_ts %>% autoplot(Cash)

atm3_data_ts <- atm_data_ts %>%
  filter(ATM == 'ATM3', Cash > 0)

atm3_data_ts %>% autoplot(Cash)

summary(atm3_data_ts)
##       DATE                ATM                 Cash      
##  Min.   :2010-04-28   Length:3           Min.   :82.00  
##  1st Qu.:2010-04-28   Class :character   1st Qu.:83.50  
##  Median :2010-04-29   Mode  :character   Median :85.00  
##  Mean   :2010-04-29                      Mean   :87.67  
##  3rd Qu.:2010-04-29                      3rd Qu.:90.50  
##  Max.   :2010-04-30                      Max.   :96.00
atm3_data_ts
## # A tsibble: 3 x 3 [1D]
## # Key:       ATM [1]
##   DATE       ATM    Cash
##   <date>     <chr> <dbl>
## 1 2010-04-28 ATM3     96
## 2 2010-04-29 ATM3     82
## 3 2010-04-30 ATM3     85
# Literally just 3 values above 0

fit_atm3 <- atm3_data_ts %>% model(MEAN(Cash))
# Forecast for 31 days of May 2010
fc_atm3 <- fit_atm3 %>% forecast(h = "31 days")

#fc_atm1_ets %>% accuracy(atm3_data_ts)

fc_atm3 %>%
  autoplot(filter_index(atm3_data_ts, "2010-04-28" ~ "2010-05-31"), level = NULL) +
  autolayer(
    filter_index(atm1_data_ts, "2010-04-28" ~ "2010-05-31"),
    colour = "black"
  ) +
  labs(y="$USD (in hundreds)", title="ATM1 Withdrawals Forecast") +
  guides(colour = guide_legend(title = "Forecast"))

There’s only 3 values with data above zero, so I’m assuming there was absolutely no activity but the ATM3 existed, or else it would have NA.

atm4_data_ts <- atm_data_ts %>%
  filter(ATM == 'ATM4')

atm4_data_ts %>% autoplot(Cash)

summary(atm4_data_ts)
##       DATE                ATM                 Cash          
##  Min.   :2009-05-01   Length:365         Min.   :    1.563  
##  1st Qu.:2009-07-31   Class :character   1st Qu.:  124.334  
##  Median :2009-10-30   Mode  :character   Median :  403.839  
##  Mean   :2009-10-30                      Mean   :  474.043  
##  3rd Qu.:2010-01-29                      3rd Qu.:  704.507  
##  Max.   :2010-04-30                      Max.   :10919.762
atm4_data_ts %>%
  model(stl = STL(Cash ~ trend(window=Inf) + season(period=7, window="periodic"))
        ) %>% 
  components() %>% autoplot()

atm4_data_ts
## # A tsibble: 365 x 3 [1D]
## # Key:       ATM [1]
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-05-01 ATM4  777. 
##  2 2009-05-02 ATM4  524. 
##  3 2009-05-03 ATM4  793. 
##  4 2009-05-04 ATM4  908. 
##  5 2009-05-05 ATM4   52.8
##  6 2009-05-06 ATM4   52.2
##  7 2009-05-07 ATM4   55.5
##  8 2009-05-08 ATM4  559. 
##  9 2009-05-09 ATM4  904. 
## 10 2009-05-10 ATM4  879. 
## # … with 355 more rows
median <- median(atm4_data_ts$Cash, na.rm=TRUE)

# Set NAs to median
atm4_data_ts$Cash[atm4_data_ts$Cash > 3000] <- median

atm4_data_ts %>% ACF() %>% autoplot()

atm4_data_ts %>% PACF() %>% autoplot()

atm4_data_ts %>% gg_season(Cash, period = "week") +
  theme(legend.position = "none") +
  labs(y="$ (in hundreds)", title="ATM Withdrawals")

atm4_data_ts %>%
  gg_subseries(period = "week") +
  labs(
    y = "$ (in hundreds)",
    title = "ATM Withdrawals"
  )

Summary output DATE ATM Cash
Min. :39934 Length:1474 Min. : 0.0
1st Qu.:40026 Class :character 1st Qu.: 0.5
Median :40118 Mode :character Median : 73.0
Mean :40118 Mean : 155.6
3rd Qu.:40210 3rd Qu.: 114.0
Max. :40312 Max. :10919.8
NA’s :19

Date 39934 to 40312

379 dates

ATM1, ATM2, ATM3, ATM4, NA

Cash 19 NA’s

Data observations: ATM1: 3 Cash NA values ATM2: 2 Cash NA values ATM3: Only 3 Cash values with something above zero ATM4: Many Cash values with a decimal, but not all, something weird there, also ATM4 appears to have 1 really crazy outlier Final 14 entries are NA, NA (DATE of 40299 and higher are NA, NA)

Dimensions output [1] 1474 3

So remove the last 14 and all that remains is 1460. And 1460 / 4 is 365, so thus one year.

Total by day and see if there is an overall relationship to be understood

# Calculate median value for ATM
# Straightforward approach to impute data
median <- median(atm_data_ts$Cash, na.rm=TRUE)

# Set NAs to median
atm_data_ts$Cash[is.na(atm_data_ts$Cash)] <- median

# Get rid of outlier from ATM4

atm_data_ts$Cash[atm_data_ts$Cash > 5000] <- median 


atm_data_ts %>% autoplot(Cash)

atm_data_total_by_day <- atm_data_ts %>%
  index_by(DATE) %>%
  summarize(Cash_Total = sum(Cash))

atm_data_total_by_day
## # A tsibble: 379 x 2 [1D]
##    DATE       Cash_Total
##    <date>          <dbl>
##  1 2009-05-01      980. 
##  2 2009-05-02      695. 
##  3 2009-05-03      968. 
##  4 2009-05-04     1053. 
##  5 2009-05-05      231. 
##  6 2009-05-06      159. 
##  7 2009-05-07       65.5
##  8 2009-05-08      766. 
##  9 2009-05-09     1098. 
## 10 2009-05-10     1090. 
## # … with 369 more rows
atm_data_total_by_day %>% autoplot(Cash_Total)

atm_data_total_by_day %>%
  model(stl = STL(Cash_Total ~ trend(window=Inf) + season(period=7, window="periodic"))
        ) %>% 
  components() %>% autoplot()

# Seasonally adjusted plot
atm_dcmp <- atm_data_total_by_day %>%
  model(stl = STL(Cash_Total ~ trend(window=Inf) + season(period=7, window="periodic"))) 

atm_dcmp %>% components() %>% autoplot()

components(atm_dcmp) %>%
  as_tsibble() %>%
  autoplot(Cash_Total, colour = "gray") +
  geom_line(aes(y=season_adjust), colour = "#0072B2") +
  labs(y = "Cash (in hundreds)", title = "Seasonally Adjusted Trendline")

Part B – Forecasting Power

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.

# Read in data
power_data_raw <- read_excel("data/ResidentialCustomerForecastLoad-624.xlsx")

# Change column name to 'Month'
names(power_data_raw)[names(power_data_raw) == 'YYYY-MMM'] <- 'Month'

# Display first 5 rows for visual inspection
head(power_data_raw)
## # A tibble: 6 × 3
##   CaseSequence Month        KWH
##          <dbl> <chr>      <dbl>
## 1          733 1998-Jan 6862583
## 2          734 1998-Feb 5838198
## 3          735 1998-Mar 5420658
## 4          736 1998-Apr 5010364
## 5          737 1998-May 4665377
## 6          738 1998-Jun 6467147
# Display summary for initial assessment
summary(power_data_raw)
##   CaseSequence      Month                KWH          
##  Min.   :733.0   Length:192         Min.   :  770523  
##  1st Qu.:780.8   Class :character   1st Qu.: 5429912  
##  Median :828.5   Mode  :character   Median : 6283324  
##  Mean   :828.5                      Mean   : 6502475  
##  3rd Qu.:876.2                      3rd Qu.: 7620524  
##  Max.   :924.0                      Max.   :10655730  
##                                     NA's   :1
# Display dimensions for assessment
# should be 16 years of data 1998-2013
# Yep, 192/12 = 16
dim(power_data_raw)
## [1] 192   3
# 192   3
# KWH

power_data_ts <- power_data_raw %>%
  mutate(Month = yearmonth(Month)) %>%
  mutate(KWH = KWH/1e3) %>% # In thousands
  as_tsibble(index = Month)

# Output first 5 rows of tsibble
head(power_data_ts)
## # A tsibble: 6 x 3 [1M]
##   CaseSequence    Month   KWH
##          <dbl>    <mth> <dbl>
## 1          733 1998 Jan 6863.
## 2          734 1998 Feb 5838.
## 3          735 1998 Mar 5421.
## 4          736 1998 Apr 5010.
## 5          737 1998 May 4665.
## 6          738 1998 Jun 6467.
# Inital plot of data
power_data_ts %>%
  autoplot(KWH)

power_data_ts %>%
  filter_index("2010" ~ "2011") %>%
  print()
## # A tsibble: 13 x 3 [1M]
##    CaseSequence    Month   KWH
##           <dbl>    <mth> <dbl>
##  1          877 2010 Jan 9397.
##  2          878 2010 Feb 8391.
##  3          879 2010 Mar 7348.
##  4          880 2010 Apr 5776.
##  5          881 2010 May 4919.
##  6          882 2010 Jun 6696.
##  7          883 2010 Jul  771.
##  8          884 2010 Aug 7923.
##  9          885 2010 Sep 7819.
## 10          886 2010 Oct 5876.
## 11          887 2010 Nov 4801.
## 12          888 2010 Dec 6153.
## 13          889 2011 Jan 8395.

First observations, data is Monthly, so I’d expect a seasonal component. 1 month is missing KWH has 1 NA value (2008 Sep) Considering imputing with median Sep Value Outlier (with very small value in July 2010) Considering imputing with median Jul Value

power_data_ts
## # A tsibble: 192 x 3 [1M]
##    CaseSequence    Month   KWH
##           <dbl>    <mth> <dbl>
##  1          733 1998 Jan 6863.
##  2          734 1998 Feb 5838.
##  3          735 1998 Mar 5421.
##  4          736 1998 Apr 5010.
##  5          737 1998 May 4665.
##  6          738 1998 Jun 6467.
##  7          739 1998 Jul 8915.
##  8          740 1998 Aug 8607.
##  9          741 1998 Sep 6990.
## 10          742 1998 Oct 6346.
## # … with 182 more rows
library(stringr)
# I want all the September values
sep_data <- power_data_ts %>%
  filter(str_detect(Month, "Sep"))

sep_data <- as_tibble(sep_data)

sep_kwh_med <- sep_data %>% 
  summarise(Median = median(KWH, na.rm = TRUE))

sep_kwh_med
## # A tibble: 1 × 1
##   Median
##    <dbl>
## 1  7667.
# Median 7666.97
#power_data_ts[power_data_ts$Month == "2008 Sep"] <- sep_kwh_med

power_data_ts[129, 3] <- sep_kwh_med

power_data_ts
## # A tsibble: 192 x 3 [1M]
##    CaseSequence    Month   KWH
##           <dbl>    <mth> <dbl>
##  1          733 1998 Jan 6863.
##  2          734 1998 Feb 5838.
##  3          735 1998 Mar 5421.
##  4          736 1998 Apr 5010.
##  5          737 1998 May 4665.
##  6          738 1998 Jun 6467.
##  7          739 1998 Jul 8915.
##  8          740 1998 Aug 8607.
##  9          741 1998 Sep 6990.
## 10          742 1998 Oct 6346.
## # … with 182 more rows
# I want all the July values
jul_data <- power_data_ts %>%
  filter(str_detect(Month, "Jul"))

jul_data <- as_tibble(jul_data)

jul_kwh_med <- jul_data %>% 
  summarise(Median = median(KWH, na.rm = TRUE))

jul_kwh_med
## # A tibble: 1 × 1
##   Median
##    <dbl>
## 1  7679.
# Median 7678.623   

power_data_ts[151, 3] <- jul_kwh_med

power_data_ts %>% autoplot(KWH)

Perhaps a Box-Cox is needed here, there does seem to be trending upward

# STL
power_data_ts %>%
  model(stl = STL(KWH ~ trend(window=Inf) + season(period=12, window="periodic"))) %>% 
  components() %>% autoplot()

power_dcmp <- power_data_ts %>%
  model(stl = STL(KWH ~ trend(window=Inf) + season(period=12, window="periodic"))) 

power_dcmp %>% components() %>% autoplot()

components(power_dcmp) %>%
  as_tsibble() %>%
  autoplot(KWH, colour = "gray") +
  geom_line(aes(y=season_adjust), colour = "#0072B2") +
  labs(y = "KWH (in thousands)", title = "Seasonally Adjusted Trendline")

# from textbook: If the seasonal component is removed from the original data, the resulting values are the “seasonally adjusted” data.

Seasonally adjusted doesn’t look too bad

# Seasonal differencing
power_data_ts %>%
  gg_tsdisplay(difference(KWH, 12),
               plot_type='partial', lag=36) +
  labs(title="Seasonally differenced", y="")

# ARIMA model #power_data_ts %>% gg_tsdisplay()

https://otexts.com/fpp3/seasonal-arima.html

arima_mod_power <- power_data_ts %>% model(ARIMA(KWH, stepwise=F, approx=F))

report(arima_mod_power)

arima_mod_power %>% gg_tsresiduals(lag=36)

forecast(arima_mod_power, h=36) %>% autoplot(power_data_ts) + labs(title = “KWH Used”, y=“KWH (in thousands)”)

# Forecasting
# ATM1 train on 2010-02-16 to 2010-04-30 (end)
# 15 + 31 + 30 = 76 days, 80% is 61 days
# Create training set (365 total, perhaps assume 1 year of data)
# 292 days, yes, using the generating dates based on the integer values


# Train on 154 months, which is 12 years and 8 months (80% of the total)
# Total is 192 months (16 years)

train_power_data <- power_data_ts %>% 
  filter_index("1998-Jan" ~ "2010-Oct")

lambda <- power_data_ts %>%
  features(KWH, features = guerrero) %>%
  pull(lambda_guerrero)

train_power_data %>%
  autoplot(box_cox(KWH, lambda)) +
  labs(y = "",
       title = latex2exp::TeX(paste0(
         "Transformed ATM1 withdrawals with $\\lambda$ = ",
         round(lambda,2))))

# Fit the models
fit_power <- train_power_data %>%
  model(
    Naive = NAIVE(box_cox(KWH, lambda)),
    `Seasonal naive` = SNAIVE(box_cox(KWH, lambda)),
    `Random walk` = RW(box_cox(KWH, lambda)),
    Arima_BC = ARIMA(box_cox(KWH, lambda)),
    Arima = ARIMA(KWH)
  )

fit_power$Arima_BC
## <lst_mdl[1]>
## [1] <ARIMA(1,0,1)(0,1,1)[12] w/ drift>
fit_power$Arima
## <lst_mdl[1]>
## [1] <ARIMA(0,0,3)(0,1,1)[12] w/ drift>
#fit_atm1_snaive <- train_atm1 %>%
#  model(SNAIVE(Cash))

fit_power %>% accuracy()
## # A tibble: 5 × 10
##   .model         .type        ME  RMSE   MAE    MPE  MAPE  MASE RMSSE     ACF1
##   <chr>          <chr>     <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 Naive          Training  -6.45 1289. 1069. -2.21  17.2  1.92  1.80   0.182  
## 2 Seasonal naive Training  69.0   715.  557.  0.478  8.79 1     1      0.267  
## 3 Random walk    Training  -6.45 1289. 1069. -2.21  17.2  1.92  1.80   0.182  
## 4 Arima_BC       Training   7.10  518.  397. -0.309  6.34 0.713 0.725  0.0885 
## 5 Arima          Training -11.7   499.  383. -0.687  6.16 0.688 0.698 -0.00393
# Check the residuals of Seasonal Naive
#fit_atm1_snaive %>% gg_tsresiduals()

# Generate forecasts for 38 months, so we'll see if it picks up the seasonal nature
fc_power <- fit_power %>% forecast(h = "38 months")

fc_power %>% accuracy(power_data_ts)
## # A tibble: 5 × 10
##   .model         .type     ME  RMSE   MAE    MPE  MAPE  MASE RMSSE  ACF1
##   <chr>          <chr>  <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Arima          Test    661. 1068.  777.   7.64  9.57  1.40  1.49 0.224
## 2 Arima_BC       Test    621. 1034.  757.   7.25  9.42  1.36  1.45 0.229
## 3 Naive          Test  -1348. 2580. 2155. -23.4  32.0   3.87  3.61 0.689
## 4 Random walk    Test  -1348. 2580. 2155. -23.4  32.0   3.87  3.61 0.689
## 5 Seasonal naive Test    303. 1018.  850.   2.77 11.0   1.53  1.42 0.419
# Plot forecasts against actual values
fc_power %>%
  autoplot(power_data_ts, level = NULL) +
  autolayer(
    filter_index(power_data_ts, "1998-Jan" ~ "2010-Oct"),
    colour = "black"
  ) +
  labs(
    y = "KWH (in thousands)",
    title = "Power Forecast"
  ) +
  guides(colour = guide_legend(title = "Forecast"))

# Result: Not good, SNAIVE at least attempts the seasonal nature

ARIMA_BC is the best RMSE Need to try the ETS

Part C – Waterflow (optional)

Part C consists of two data sets. These are simple 2 columns sets, however they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (example of what this looks like, follows). Note for multiple recordings within an hour, take the mean. Then to determine if the data is stationary and can it be forecast. If so, provide a week forward forecast and present results via Rpubs and .rmd and the forecast in an Excel readable file.

# Read in data
pipe1_data_raw <- read_excel("data/Waterflow_Pipe1.xlsx", col_types = c("date", "numeric"))
pipe2_data_raw <- read_excel("data/Waterflow_Pipe2.xlsx", col_types = c("date", "numeric"))

# From: https://stackoverflow.com/questions/53635818/convert-datetime-from-excel-to-r

pipe1_data_raw$`Date Time` <- as.POSIXct(pipe1_data_raw$`Date Time`,
                              origin="1899-12-30",
                              tz="GMT")

pipe2_data_raw$`Date Time` <- as.POSIXct(pipe2_data_raw$`Date Time`,
                              origin="1899-12-30",
                              tz="GMT")


# Initial output to see data
head(pipe1_data_raw)
## # A tibble: 6 × 2
##   `Date Time`         WaterFlow
##   <dttm>                  <dbl>
## 1 2015-10-23 00:24:06     23.4 
## 2 2015-10-23 00:40:02     28.0 
## 3 2015-10-23 00:53:51     23.1 
## 4 2015-10-23 00:55:40     30.0 
## 5 2015-10-23 01:19:17      6.00
## 6 2015-10-23 01:23:58     15.9
head(pipe2_data_raw)
## # A tibble: 6 × 2
##   `Date Time`         WaterFlow
##   <dttm>                  <dbl>
## 1 2015-10-23 01:00:00      18.8
## 2 2015-10-23 02:00:00      43.1
## 3 2015-10-23 03:00:00      38.0
## 4 2015-10-23 04:00:00      36.1
## 5 2015-10-23 05:00:00      31.9
## 6 2015-10-23 06:00:00      28.2
summary(pipe1_data_raw)
##    Date Time                     WaterFlow     
##  Min.   :2015-10-23 00:24:06   Min.   : 1.067  
##  1st Qu.:2015-10-25 11:21:35   1st Qu.:13.683  
##  Median :2015-10-27 20:07:30   Median :19.880  
##  Mean   :2015-10-27 20:49:15   Mean   :19.897  
##  3rd Qu.:2015-10-30 08:24:51   3rd Qu.:26.159  
##  Max.   :2015-11-01 23:35:43   Max.   :38.913
summary(pipe2_data_raw)
##    Date Time                     WaterFlow     
##  Min.   :2015-10-23 01:00:00   Min.   : 1.885  
##  1st Qu.:2015-11-02 10:45:00   1st Qu.:28.140  
##  Median :2015-11-12 20:30:00   Median :39.682  
##  Mean   :2015-11-12 20:30:00   Mean   :39.556  
##  3rd Qu.:2015-11-23 06:15:00   3rd Qu.:50.782  
##  Max.   :2015-12-03 16:00:00   Max.   :78.303
dim(pipe1_data_raw)
## [1] 1000    2
dim(pipe2_data_raw)
## [1] 1000    2

Define as tsibble

atm_data_ts <- atm_data_raw %>% as_tsibble(index = DATE, key = ATM)

atm_data_ts ```

#30#